This report documents the final, robust workflow for analyzing a curated dataset of 704 clathrate CIF files. The goal is to calculate the weighted average network bond distance for each structure and explore its relationship with the lattice parameter ‘a’. The methodology incorporates a multi-step filtering pipeline to handle the complexities of real-world crystallographic data, such as site-occupancy disorder and non-physical “ghost” distances.
The curated dataset is organized into subdirectories defining the
specific filtering rules for the files within (e.g.,
6c-16i-24k). The script recursively finds all CIF files and
maps each one to its structural category.
cif_root_dir <- "/Users/donngo/repos/ml-crystals/notebook/workflow/Verified"
if (!dir.exists(cif_root_dir)) stop(paste("CRITICAL ERROR: Root directory not found:", cif_root_dir))
category_dirs <- list.dirs(path = cif_root_dir, full.names = TRUE, recursive = FALSE)
file_map_list <- lapply(category_dirs, function(dir) {
full_paths <- list.files(path = dir, pattern = "\\.cif$", full.names = TRUE, ignore.case = TRUE)
if (length(full_paths) == 0) return(NULL)
data.table(file_path = full_paths, file_name = basename(full_paths), category = basename(dir))
})
file_map <- rbindlist(file_map_list, fill = TRUE)
all_cif_paths <- if (nrow(file_map) > 0) file_map$file_path else character(0)
cat("Found", length(all_cif_paths), "total CIF files across", length(category_dirs), "categories.\n")
## Found 704 total CIF files across 10 categories.
if (length(all_cif_paths) == 0) stop("Execution stopped: No CIF files were found.")
We use the main analyze_cif_files() function with
optimized parameters to extract raw data and calculate all interatomic
distances. Subsequent bonding algorithms and non-essential calculations
are disabled to improve performance, as these will be handled by our
custom pipeline.
analysis_results <- analyze_cif_files(
file_paths = all_cif_paths,
perform_extraction = TRUE,
perform_calcs_and_transforms = TRUE,
bonding_algorithms = "none", # Disable default bonding to use our custom method
calculate_bond_angles = FALSE,
perform_error_propagation = FALSE
)
cat("Optimized analysis complete. Raw results table has", nrow(analysis_results), "rows.\n")
## Optimized analysis complete. Raw results table has 704 rows.
This section implements the core filtering and calculation logic. The robust workflow (Guests -> Ghosts -> Bonds -> Average) is applied to each file to ensure accurate and reliable results.
# --- Timing Start ---
timing_processing <- system.time({
guest_atoms <- c("Na", "K", "Rb", "Cs", "Sr", "Ba", "Eu")
all_removed_ghosts <- list()
process_file_results <- function(i) {
current_filename <- analysis_results$file_name[i]
category_info <- file_map[file_name == current_filename]
if (is.null(current_filename) ||
nrow(category_info) == 0)
return(NULL)
distances <- analysis_results$distances[[i]]
coords <- analysis_results$atomic_coordinates[[i]]
if (is.null(distances) ||
is.null(coords) || nrow(distances) == 0)
return(NULL)
# --- Step 1: Parse Category Rules and Filter Guests ---
category <- category_info$category
target_wyckoff_symbols <- strsplit(gsub("\\+M_on_.*", "", category), "-")[[1]]
# Unconditionally filter out guest atoms to analyze the host framework
distances <- filter_by_elements(distances, coords, guest_atoms)
if (nrow(distances) == 0)
return(NULL)
# --- Step 2: Filter "Ghost" Distances (Covalent Radii Method) ---
ghost_filter_result <- filter_ghost_distances(distances, coords, margin = 0.1)
cleaned_distances <- ghost_filter_result$kept
# Store removed distances for quality control
removed_table <- ghost_filter_result$removed
if (nrow(removed_table) > 0) {
removed_table[, file := current_filename]
all_removed_ghosts[[length(all_removed_ghosts) + 1]] <<- removed_table
}
if (nrow(cleaned_distances) == 0)
return(NULL)
# --- Step 3: Identify Bonded Pairs (Minimum Distance Method) ---
bonded_pairs <- minimum_distance(cleaned_distances, delta = 0.1)
if (nrow(bonded_pairs) == 0)
return(NULL)
# --- Step 4: Calculate Final Weighted Average ---
weighted_avg <- calculate_weighted_average_network_distance(bonded_pairs, coords, target_wyckoff_symbols)
if (is.na(weighted_avg))
return(NULL)
return(
data.table(
file = current_filename,
category = category,
lattice_parameter_a = analysis_results$unit_cell_metrics[[i]]$`_cell_length_a`,
weighted_distance = weighted_avg
)
)
}
# Execute the processing for all files
plot_data_list <- lapply(1:nrow(analysis_results), process_file_results)
plot_data <- rbindlist(plot_data_list, use.names = TRUE, fill = TRUE)
plot_data <- na.omit(plot_data)
removed_ghosts_summary <- rbindlist(all_removed_ghosts, fill = TRUE)
}) # --- Timing End ---
# Final summary output
cat("Data processing complete. Final plot table has",
nrow(plot_data),
"entries.\n")
## Data processing complete. Final plot table has 704 entries.
if (exists("removed_ghosts_summary") &&
nrow(removed_ghosts_summary) > 0) {
cat(
nrow(removed_ghosts_summary),
"non-physical 'ghost' distances were identified and removed across all files.\n"
)
}
## 5297226 non-physical 'ghost' distances were identified and removed across all files.
This section summarizes the interatomic distances that were automatically filtered out as non-physical before bond identification. A sample of up to 1,000 such distances is shown below, and the complete list is saved to a separate CSV file for full analysis.
export_dir <- "interactive_report_files"
dir.create(export_dir, showWarnings = FALSE)
if (exists("removed_ghosts_summary") && nrow(removed_ghosts_summary) > 0) {
full_csv_path <- file.path(export_dir, "removed_ghosts_summary.csv")
fwrite(removed_ghosts_summary, full_csv_path)
cat(paste("Full list of", nrow(removed_ghosts_summary), "removed distances saved to:", full_csv_path, "\n\n"))
sample_size <- min(1000, nrow(removed_ghosts_summary))
cat(paste("Displaying a sample of the first", sample_size, "removed distances below:\n"))
datatable(
removed_ghosts_summary[1:sample_size, ],
caption = paste("Sample of Removed Ghost Distances (", sample_size, " of ", nrow(removed_ghosts_summary), " total)"),
rownames = FALSE, extensions = 'Buttons',
options = list(pageLength = 10, dom = 'Bfrtip', buttons = c('copy', 'csv')),
colnames = c("Atom 1", "Atom 2", "Removed Distance (Å)", "Expected (Å)", "Lower Bound (Å)", "Upper Bound (Å)", "Reason", "File")
)
} else {
cat("No 'ghost' distances were detected during the analysis.")
}
## Full list of 5297226 removed distances saved to: interactive_report_files/removed_ghosts_summary.csv
##
## Displaying a sample of the first 1000 removed distances below:
The plot below visualizes the final, cleaned data, showing the relationship between the calculated weighted average network bond length and the lattice parameter ‘a’.
p <- ggplot(plot_data, aes(x = lattice_parameter_a, y = weighted_distance, color = category, shape = category, text = file)) +
geom_point(alpha = 0.65, size = 2.5) +
geom_smooth(aes(group = 1), method = "lm", se = FALSE, color = "black", linetype = "dotted", fullrange = TRUE) +
labs(
title = "Average Network Bond Length vs. Lattice Parameter 'a'",
subtitle = "Data points are colored and shaped by their structural category",
x = "Lattice Parameter a (Å)",
y = "Weighted Average Network Bond Length (Å)",
color = "Structural Category", shape = "Structural Category"
) +
theme_bw(base_size = 14) +
theme(legend.position = "bottom", legend.title = element_text(face = "bold"))
interactive_plot <- ggplotly(p, tooltip = c("x", "y", "text", "color"))
interactive_plot
This table contains the final data used in the plot above. It can be searched, sorted, and exported.
datatable(
plot_data,
caption = "Final Processed Data for Clathrate Structures",
rownames = FALSE, filter = 'top', extensions = 'Buttons',
options = list(pageLength = 15, dom = 'Bfrtip', buttons = c('copy', 'csv', 'excel')),
colnames = c("File", "Category", "Lattice 'a' (Å)", "Weighted Dist (Å)")
)
This final section saves all key results for sharing and archival.
plot_filename <- file.path(export_dir, "interactive_clathrate_plot.html")
saveWidget(interactive_plot, file = plot_filename, selfcontained = TRUE)
cat(paste0("Interactive plot saved to '", plot_filename, "'\n"))
## Interactive plot saved to 'interactive_report_files/interactive_clathrate_plot.html'
final_data_table <- datatable(
plot_data, caption = "Final Processed Data for Clathrate Structures",
extensions = 'Buttons', options = list(dom = 'Bfrtip', buttons = c('copy', 'csv', 'excel'))
)
plot_table_filename <- file.path(export_dir, "final_clathrate_data.html")
saveWidget(final_data_table, file = plot_table_filename, selfcontained = TRUE)
cat(paste0("Interactive plot data table saved to '", plot_table_filename, "'\n"))
## Interactive plot data table saved to 'interactive_report_files/final_clathrate_data.html'
detailed_export_path <- file.path(export_dir, "detailed_analysis_csvs")
export_analysis_to_csv(analysis_results = analysis_results, output_dir = detailed_export_path, overwrite = TRUE)
cat("...Done. Check the '", export_dir, "' folder for all outputs.\n")
## ...Done. Check the ' interactive_report_files ' folder for all outputs.